# Load the training and test sets. TRUE/FALSE are spelled out because the
# shorthands T and F are ordinary variables that can be reassigned.
tr <- fread("train.csv", header = TRUE, showProgress = FALSE)
te <- fread("test.csv", header = TRUE, showProgress = FALSE)

# Keep the response as a separate vector and remove it from the training table.
target <- tr$target
tr$target <- NULL

# Number of rows per target value.
summary(as.factor(target))
## 0 1
## 179902 20098

# Drop the ID_code column from both tables.
tr$ID_code <- NULL
te$ID_code <- NULL

# Plot the structure of each table.
plot_str(tr)
plot_str(te)
Only continuous columns.
# Basic dataset description (row/column counts, missing values, memory use)
# for the training table, then for the test table.
tr %>% introduce()
## rows columns discrete_columns continuous_columns all_missing_columns
## 1: 200000 200 0 200 0
## total_missing_values complete_rows total_observations memory_usage
## 1: 0 200000 40000000 320037768
te %>% introduce()
## rows columns discrete_columns continuous_columns all_missing_columns
## 1: 200000 200 0 200 0
## total_missing_values complete_rows total_observations memory_usage
## 1: 0 200000 40000000 320037768
# Missing-value profile of the training table.
tr %>% plot_missing()
No missing values.
# Missing-value profile of the test table.
te %>% plot_missing()
No missing values.
# Pairwise correlation matrix of the training columns, drawn as a
# lower-triangle correlogram without the diagonal.
corrplot(cor(tr, use = "complete.obs"), type = "lower", diag = FALSE)

# Same plot for the test columns.
corrplot(cor(te, use = "complete.obs"), type = "lower", diag = FALSE)
No correlation (normalized data?)
# Re-read the training file so that the `target` column is available for the
# per-class density plots. TRUE/FALSE are spelled out because the shorthands
# T and F are ordinary variables that can be reassigned.
train <- fread("train.csv", header = TRUE, showProgress = FALSE)

# Column positions of the first batch of 20 columns to plot; later batches
# are addressed by adding an offset to these positions.
feature_groups <- 3:22

# Column 2 (used as `target` below) plus the current batch of columns.
col_names <- colnames(train)[c(2, feature_groups)]

# Reshape to long form: one row per (target, feature, value) triple.
# `with = FALSE` makes data.table treat col_names as a character vector of
# column names.
temp <- gather(
  train[, col_names, with = FALSE],
  key = "features", value = "value", -target
)
temp$target <- factor(temp$target)
# Fix the level order so the facets follow the column order in `train`;
# labels default to the levels, so they are not repeated.
temp$features <- factor(temp$features, levels = col_names[-1])

# One density panel per feature, split by target class. Only the outline
# colour is set manually; the fill keeps ggplot2's default discrete palette.
ggplot(data = temp, aes(x = value)) +
  geom_density(aes(fill = target, color = target), alpha = 0.3) +
  scale_color_manual(values = c("1" = "dodgerblue", "0" = "firebrick1")) +
  theme_tufte() +
  facet_wrap(~ features, ncol = 4, scales = "free")
var0, 1, 2, 6, 9, 12, 13, 14, 18
# Per-class density plots for the batch of columns at positions
# feature_groups + 20 of `train`, together with column 2 (used as `target`).
col_names <- colnames(train)[c(2, feature_groups + 20)]

# Reshape to long form: one row per (target, feature, value) triple.
# `with = FALSE` (rather than the reassignable shorthand `F`) makes data.table
# treat col_names as a character vector of column names.
temp <- gather(
  train[, col_names, with = FALSE],
  key = "features", value = "value", -target
)
temp$target <- factor(temp$target)
# Fix the level order so the facets follow the column order in `train`;
# labels default to the levels, so they are not repeated.
temp$features <- factor(temp$features, levels = col_names[-1])

# One density panel per feature, split by target class. Only the outline
# colour is set manually; the fill keeps ggplot2's default discrete palette.
ggplot(data = temp, aes(x = value)) +
  geom_density(aes(fill = target, color = target), alpha = 0.3) +
  scale_color_manual(values = c("1" = "dodgerblue", "0" = "firebrick1")) +
  theme_tufte() +
  facet_wrap(~ features, ncol = 4, scales = "free")
var21, 22, 26, 33, 34, 35, 36
# Per-class density plots for the batch of columns at positions
# feature_groups + 40 of `train`, together with column 2 (used as `target`).
col_names <- colnames(train)[c(2, feature_groups + 40)]

# Reshape to long form: one row per (target, feature, value) triple.
# `with = FALSE` (rather than the reassignable shorthand `F`) makes data.table
# treat col_names as a character vector of column names.
temp <- gather(
  train[, col_names, with = FALSE],
  key = "features", value = "value", -target
)
temp$target <- factor(temp$target)
# Fix the level order so the facets follow the column order in `train`;
# labels default to the levels, so they are not repeated.
temp$features <- factor(temp$features, levels = col_names[-1])

# One density panel per feature, split by target class. Only the outline
# colour is set manually; the fill keeps ggplot2's default discrete palette.
ggplot(data = temp, aes(x = value)) +
  geom_density(aes(fill = target, color = target), alpha = 0.3) +
  scale_color_manual(values = c("1" = "dodgerblue", "0" = "firebrick1")) +
  theme_tufte() +
  facet_wrap(~ features, ncol = 4, scales = "free")
var40, 41, 44, 48, 52, 53, 55
# Per-class density plots for the batch of columns at positions
# feature_groups + 60 of `train`, together with column 2 (used as `target`).
col_names <- colnames(train)[c(2, feature_groups + 60)]

# Reshape to long form: one row per (target, feature, value) triple.
# `with = FALSE` (rather than the reassignable shorthand `F`) makes data.table
# treat col_names as a character vector of column names.
temp <- gather(
  train[, col_names, with = FALSE],
  key = "features", value = "value", -target
)
temp$target <- factor(temp$target)
# Fix the level order so the facets follow the column order in `train`;
# labels default to the levels, so they are not repeated.
temp$features <- factor(temp$features, levels = col_names[-1])

# One density panel per feature, split by target class. Only the outline
# colour is set manually; the fill keeps ggplot2's default discrete palette.
ggplot(data = temp, aes(x = value)) +
  geom_density(aes(fill = target, color = target), alpha = 0.3) +
  scale_color_manual(values = c("1" = "dodgerblue", "0" = "firebrick1")) +
  theme_tufte() +
  facet_wrap(~ features, ncol = 4, scales = "free")
var66, 67, 71, 75, 76, 78
# Per-class density plots for the batch of columns at positions
# feature_groups + 80 of `train`, together with column 2 (used as `target`).
col_names <- colnames(train)[c(2, feature_groups + 80)]

# Reshape to long form: one row per (target, feature, value) triple.
# `with = FALSE` (rather than the reassignable shorthand `F`) makes data.table
# treat col_names as a character vector of column names.
temp <- gather(
  train[, col_names, with = FALSE],
  key = "features", value = "value", -target
)
temp$target <- factor(temp$target)
# Fix the level order so the facets follow the column order in `train`;
# labels default to the levels, so they are not repeated.
temp$features <- factor(temp$features, levels = col_names[-1])

# One density panel per feature, split by target class. Only the outline
# colour is set manually; the fill keeps ggplot2's default discrete palette.
ggplot(data = temp, aes(x = value)) +
  geom_density(aes(fill = target, color = target), alpha = 0.3) +
  scale_color_manual(values = c("1" = "dodgerblue", "0" = "firebrick1")) +
  theme_tufte() +
  facet_wrap(~ features, ncol = 4, scales = "free")
var80, 81, 86, 92, 93, 94, 95, 99
# Per-class density plots for the batch of columns at positions
# feature_groups + 100 of `train`, together with column 2 (used as `target`).
col_names <- colnames(train)[c(2, feature_groups + 100)]

# Reshape to long form: one row per (target, feature, value) triple.
# `with = FALSE` (rather than the reassignable shorthand `F`) makes data.table
# treat col_names as a character vector of column names.
temp <- gather(
  train[, col_names, with = FALSE],
  key = "features", value = "value", -target
)
temp$target <- factor(temp$target)
# Fix the level order so the facets follow the column order in `train`;
# labels default to the levels, so they are not repeated.
temp$features <- factor(temp$features, levels = col_names[-1])

# One density panel per feature, split by target class. Only the outline
# colour is set manually; the fill keeps ggplot2's default discrete palette.
ggplot(data = temp, aes(x = value)) +
  geom_density(aes(fill = target, color = target), alpha = 0.3) +
  scale_color_manual(values = c("1" = "dodgerblue", "0" = "firebrick1")) +
  theme_tufte() +
  facet_wrap(~ features, ncol = 4, scales = "free")
var108, 109, 110, 115, 116, 118, 119
# Per-class density plots for the batch of columns at positions
# feature_groups + 120 of `train`, together with column 2 (used as `target`).
col_names <- colnames(train)[c(2, feature_groups + 120)]

# Reshape to long form: one row per (target, feature, value) triple.
# `with = FALSE` (rather than the reassignable shorthand `F`) makes data.table
# treat col_names as a character vector of column names.
temp <- gather(
  train[, col_names, with = FALSE],
  key = "features", value = "value", -target
)
temp$target <- factor(temp$target)
# Fix the level order so the facets follow the column order in `train`;
# labels default to the levels, so they are not repeated.
temp$features <- factor(temp$features, levels = col_names[-1])

# One density panel per feature, split by target class. Only the outline
# colour is set manually; the fill keeps ggplot2's default discrete palette.
ggplot(data = temp, aes(x = value)) +
  geom_density(aes(fill = target, color = target), alpha = 0.3) +
  scale_color_manual(values = c("1" = "dodgerblue", "0" = "firebrick1")) +
  theme_tufte() +
  facet_wrap(~ features, ncol = 4, scales = "free")
var121, 122, 123, 125, 127, 130, 131, 132, 133, 135, 137, 139
# Per-class density plots for the batch of columns at positions
# feature_groups + 140 of `train`, together with column 2 (used as `target`).
col_names <- colnames(train)[c(2, feature_groups + 140)]

# Reshape to long form: one row per (target, feature, value) triple.
# `with = FALSE` (rather than the reassignable shorthand `F`) makes data.table
# treat col_names as a character vector of column names.
temp <- gather(
  train[, col_names, with = FALSE],
  key = "features", value = "value", -target
)
temp$target <- factor(temp$target)
# Fix the level order so the facets follow the column order in `train`;
# labels default to the levels, so they are not repeated.
temp$features <- factor(temp$features, levels = col_names[-1])

# One density panel per feature, split by target class. Only the outline
# colour is set manually; the fill keeps ggplot2's default discrete palette.
ggplot(data = temp, aes(x = value)) +
  geom_density(aes(fill = target, color = target), alpha = 0.3) +
  scale_color_manual(values = c("1" = "dodgerblue", "0" = "firebrick1")) +
  theme_tufte() +
  facet_wrap(~ features, ncol = 4, scales = "free")
var141, 146, 147, 148, 149, 154, 157
# Per-class density plots for the batch of columns at positions
# feature_groups + 160 of `train`, together with column 2 (used as `target`).
col_names <- colnames(train)[c(2, feature_groups + 160)]

# Reshape to long form: one row per (target, feature, value) triple.
# `with = FALSE` (rather than the reassignable shorthand `F`) makes data.table
# treat col_names as a character vector of column names.
temp <- gather(
  train[, col_names, with = FALSE],
  key = "features", value = "value", -target
)
temp$target <- factor(temp$target)
# Fix the level order so the facets follow the column order in `train`;
# labels default to the levels, so they are not repeated.
temp$features <- factor(temp$features, levels = col_names[-1])

# One density panel per feature, split by target class. Only the outline
# colour is set manually; the fill keeps ggplot2's default discrete palette.
ggplot(data = temp, aes(x = value)) +
  geom_density(aes(fill = target, color = target), alpha = 0.3) +
  scale_color_manual(values = c("1" = "dodgerblue", "0" = "firebrick1")) +
  theme_tufte() +
  facet_wrap(~ features, ncol = 4, scales = "free")
var163, 164, 165, 166, 169, 170, 172, 173, 174, 177, 179
# Per-class density plots for the batch of columns at positions
# feature_groups + 180 of `train`, together with column 2 (used as `target`).
col_names <- colnames(train)[c(2, feature_groups + 180)]

# Reshape to long form: one row per (target, feature, value) triple.
# `with = FALSE` (rather than the reassignable shorthand `F`) makes data.table
# treat col_names as a character vector of column names.
temp <- gather(
  train[, col_names, with = FALSE],
  key = "features", value = "value", -target
)
temp$target <- factor(temp$target)
# Fix the level order so the facets follow the column order in `train`;
# labels default to the levels, so they are not repeated.
temp$features <- factor(temp$features, levels = col_names[-1])

# One density panel per feature, split by target class. Only the outline
# colour is set manually; the fill keeps ggplot2's default discrete palette.
ggplot(data = temp, aes(x = value)) +
  geom_density(aes(fill = target, color = target), alpha = 0.3) +
  scale_color_manual(values = c("1" = "dodgerblue", "0" = "firebrick1")) +
  theme_tufte() +
  facet_wrap(~ features, ncol = 4, scales = "free")
var180, 184, 188, 190, 191, 192, 198